// Copyright 2015 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "content/common/cross_site_document_classifier.h"

#include <stddef.h>
#include <string.h>

#include "base/command_line.h"
#include "base/lazy_instance.h"
#include "base/logging.h"
#include "base/macros.h"
#include "base/metrics/histogram_macros.h"
#include "base/strings/string_util.h"
#include "content/public/common/content_switches.h"
#include "content/public/common/resource_response_info.h"
#include "net/base/registry_controlled_domains/registry_controlled_domain.h"
#include "net/http/http_response_headers.h"

using base::StringPiece;

namespace content {

namespace {

    // Canonical MIME type strings recognized by the classifier.
    const char kTextHtml[] = "text/html";
    const char kTextXml[] = "text/xml";
    const char kAppRssXml[] = "application/rss+xml";
    const char kAppXml[] = "application/xml";
    const char kAppJson[] = "application/json";
    const char kTextJson[] = "text/json";
    const char kTextXjson[] = "text/x-json";
    const char kTextPlain[] = "text/plain";

    // Returns true if |data|, after leading whitespace is skipped, begins
    // with any of the |arr_size| entries in |signatures| (compared
    // ASCII-case-insensitively). Whitespace-only or empty data never matches.
    bool MatchesSignature(StringPiece data,
        const StringPiece signatures[],
        size_t arr_size)
    {
        const size_t first_significant = data.find_first_not_of(" \t\r\n");
        // There is no non-whitespace character in this document.
        if (first_significant == base::StringPiece::npos)
            return false;

        data.remove_prefix(first_significant);
        for (size_t i = 0; i < arr_size; ++i) {
            const bool starts_with_signature = base::StartsWith(
                data, signatures[i], base::CompareCase::INSENSITIVE_ASCII);
            if (starts_with_signature)
                return true;
        }
        return false;
    }

} // namespace

// Maps a (case-insensitive) MIME type string onto the small set of
// canonical categories that the document blocking policy cares about.
// Unrecognized types fall through to CROSS_SITE_DOCUMENT_MIME_TYPE_OTHERS.
CrossSiteDocumentMimeType CrossSiteDocumentClassifier::GetCanonicalMimeType(
    const std::string& mime_type)
{
    if (base::LowerCaseEqualsASCII(mime_type, kTextHtml))
        return CROSS_SITE_DOCUMENT_MIME_TYPE_HTML;

    if (base::LowerCaseEqualsASCII(mime_type, kTextPlain))
        return CROSS_SITE_DOCUMENT_MIME_TYPE_PLAIN;

    // Several MIME types are conventionally used for JSON payloads.
    const bool is_json = base::LowerCaseEqualsASCII(mime_type, kAppJson) ||
        base::LowerCaseEqualsASCII(mime_type, kTextJson) ||
        base::LowerCaseEqualsASCII(mime_type, kTextXjson);
    if (is_json)
        return CROSS_SITE_DOCUMENT_MIME_TYPE_JSON;

    // Likewise for XML, including RSS feeds.
    const bool is_xml = base::LowerCaseEqualsASCII(mime_type, kTextXml) ||
        base::LowerCaseEqualsASCII(mime_type, kAppRssXml) ||
        base::LowerCaseEqualsASCII(mime_type, kAppXml);
    if (is_xml)
        return CROSS_SITE_DOCUMENT_MIME_TYPE_XML;

    return CROSS_SITE_DOCUMENT_MIME_TYPE_OTHERS;
}

// Only HTTP(S) responses are candidates for blocking. ftp:// is excluded:
// FTP doesn't provide a Content-Type header, which our policy depends on,
// so we cannot protect any document from FTP servers.
bool CrossSiteDocumentClassifier::IsBlockableScheme(const GURL& url)
{
    const bool is_http = url.SchemeIs(url::kHttpScheme);
    const bool is_https = url.SchemeIs(url::kHttpsScheme);
    return is_http || is_https;
}

// Returns true when |response_url| belongs to the same site (registrable
// domain plus scheme) as |frame_origin|.
bool CrossSiteDocumentClassifier::IsSameSite(const url::Origin& frame_origin,
    const GURL& response_url)
{
    // A unique (opaque) origin is never same-site with anything.
    if (frame_origin.unique())
        return false;

    // Nor is an unparseable URL.
    if (!response_url.is_valid())
        return false;

    // Schemes must match exactly (e.g. http vs. https are different sites).
    if (frame_origin.scheme() != response_url.scheme())
        return false;

    // SameDomainOrHost() extracts the effective domains (public suffix plus
    // one) from the two URLs and compares them.
    return net::registry_controlled_domains::SameDomainOrHost(
        response_url, frame_origin,
        net::registry_controlled_domains::INCLUDE_PRIVATE_REGISTRIES);
}

// We don't use WebKit's existing CORS policy implementation since its
// policy works in terms of origins, not sites. For example, a frame at
// sub.a.com is not allowed by CORS to access a document at sub1.a.com,
// but under Site Isolation that access is allowed.
// Returns true if |access_control_origin| (the Access-Control-Allow-Origin
// header value) permits |frame_origin| to read the response.
// NOTE(review): |website_origin| is currently unused by this implementation.
bool CrossSiteDocumentClassifier::IsValidCorsHeaderSet(
    const url::Origin& frame_origin,
    const GURL& website_origin,
    const std::string& access_control_origin)
{
    // Many websites are sending back "\"*\"" instead of "*". This is
    // non-standard practice, and not supported by Chrome. Refer to
    // CrossOriginAccessControl::passesAccessControlCheck().

    // TODO(dsjang): * is not allowed for the response from a request
    // with cookies. This allows for more than what the renderer will
    // eventually be able to receive, so we won't see illegal cross-site
    // documents allowed by this. We have to find a way to see if this
    // response is from a cookie-tagged request or not in the future.
    const bool allows_any_origin = (access_control_origin == "*");
    if (allows_any_origin)
        return true;

    // TODO(dsjang): The CORS spec only treats a fully specified URL, except
    // for "*", but many websites are using just a domain for
    // access_control_origin, and this is blocked by WebKit's CORS logic here:
    // CrossOriginAccessControl::passesAccessControlCheck(). GURL is set
    // is_valid() to false when it is created from a URL containing * in the
    // domain part.
    const GURL cors_origin(access_control_origin);
    return IsSameSite(frame_origin, cors_origin);
}

// This function is a slight modification of |net::SniffForHTML|.
// Returns true if |data| sniffs as HTML: it (after whitespace) starts with
// a known HTML tag signature, possibly after skipping leading "<!-- ... -->"
// comments.
//
// Fix: a stray "#include <stddef.h>" had been dropped into the middle of
// this function body (splitting a comment sentence in half); the include now
// lives in the file's include block at the top where it belongs.
bool CrossSiteDocumentClassifier::SniffForHTML(StringPiece data)
{
    // The content sniffer used by Chrome and Firefox are using "<!--"
    // as one of the HTML signatures, but it also appears in valid
    // JavaScript, considered as well-formed JS by the browser.  Since
    // we do not want to block any JS, we exclude it from our HTML
    // signatures. This can weaken our document block policy, but we can
    // break less websites.
    // TODO(dsjang): parameterize |net::SniffForHTML| with an option
    // that decides whether to include <!-- or not, so that we can
    // remove this function.
    // TODO(dsjang): Once CrossSiteDocumentClassifier is moved into the browser
    // process, we should do single-thread checking here for the static
    // initializer.
    static const StringPiece kHtmlSignatures[] = {
        StringPiece("<!doctype html"), // HTML5 spec
        StringPiece("<script"), // HTML5 spec, Mozilla
        StringPiece("<html"), // HTML5 spec, Mozilla
        StringPiece("<head"), // HTML5 spec, Mozilla
        StringPiece("<iframe"), // Mozilla
        StringPiece("<h1"), // Mozilla
        StringPiece("<div"), // Mozilla
        StringPiece("<font"), // Mozilla
        StringPiece("<table"), // Mozilla
        StringPiece("<a"), // Mozilla
        StringPiece("<style"), // Mozilla
        StringPiece("<title"), // Mozilla
        StringPiece("<b"), // Mozilla
        StringPiece("<body"), // Mozilla
        StringPiece("<br"), // Mozilla
        StringPiece("<p") // Mozilla
    };

    while (data.length() > 0) {
        if (MatchesSignature(data, kHtmlSignatures, arraysize(kHtmlSignatures)))
            return true;

        // If we cannot find "<!--", we fail sniffing this as HTML.
        static const StringPiece kCommentBegins[] = { StringPiece("<!--") };
        if (!MatchesSignature(data, kCommentBegins, arraysize(kCommentBegins)))
            break;

        // Search for --> and do SniffForHTML after that. If we can find the
        // comment's end, we start HTML sniffing from there again.
        static const char kEndComment[] = "-->";
        size_t offset = data.find(kEndComment);
        if (offset == base::StringPiece::npos)
            break;

        // Proceed to the index next to the ending comment (-->).
        data.remove_prefix(offset + strlen(kEndComment));
    }

    return false;
}

// Returns true if |data| sniffs as XML, i.e. begins (after whitespace)
// with an "<?xml" processing-instruction prolog.
bool CrossSiteDocumentClassifier::SniffForXML(base::StringPiece data)
{
    // TODO(dsjang): Chrome's mime_sniffer is using strncasecmp() for
    // this signature. However, XML is case-sensitive. Don't we have to
    // be more lenient only to block documents starting with the exact
    // string <?xml rather than <?XML ?
    // TODO(dsjang): Once CrossSiteDocumentClassifier is moved into the browser
    // process, we should do single-thread checking here for the static
    // initializer.
    static const StringPiece kXmlSignatures[] = { StringPiece("<?xml") };
    const size_t signature_count = arraysize(kXmlSignatures);
    return MatchesSignature(data, kXmlSignatures, signature_count);
}

// Returns true if |data| sniffs as the start of a JSON object: optional
// whitespace, '{', optional whitespace, a quote character, then anything
// up to the first ':'.
bool CrossSiteDocumentClassifier::SniffForJSON(base::StringPiece data)
{
    // TODO(dsjang): We have to come up with a better way to sniff
    // JSON. However, even RE cannot help us that much due to the fact
    // that we don't do full parsing.  This hand-rolled scanner looks for
    // {, "/' and : in that order; we're avoiding adding a dependency on
    // a regular expression library.
    enum class State {
        kAwaitingBrace,
        kAwaitingQuote,
        kAwaitingColon,
    };
    State state = State::kAwaitingBrace;

    for (size_t i = 0; i < data.length(); ++i) {
        const char c = data[i];
        // Whitespace is insignificant everywhere in this scan.
        if (c == ' ' || c == '\t' || c == '\r' || c == '\n')
            continue;

        if (state == State::kAwaitingBrace) {
            // The document must open with a brace or it is not JSON-like.
            if (c != '{')
                return false;
            state = State::kAwaitingQuote;
        } else if (state == State::kAwaitingQuote) {
            // A key must begin with a double or single quote.
            if (c != '\"' && c != '\'')
                return false;
            state = State::kAwaitingColon;
        } else {
            // Inside the quoted key: consume characters until the first
            // ':' is seen, which makes this look like a JSON object.
            if (c == ':')
                return true;
        }
    }
    return false;
}

} // namespace content
